setwd("/mnt/picea/projects/aspseq/jfelten/T89-Laccaria-bicolor/Salmon")
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(LSD))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")
pal <- brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/UPSCb/projects/T89-Laccaria-bicolor/doc/Samples.csv") %>%
mutate(Time=factor(Time)) %>%
mutate(Experiment=factor(Experiment))
## Parsed with column specification:
## cols(
## SciLifeID = col_character(),
## SampleName = col_double(),
## Time = col_double(),
## Experiment = col_character(),
## Replicate = col_character()
## )
tx2gene translation
Potra.tx2gene <- read_delim("/mnt/picea/storage/reference/Populus-tremula/v1.1/annotation/tx2gene.tsv",
"\t",col_names=c("TXID","GENE"))
## Parsed with column specification:
## cols(
## TXID = col_character(),
## GENE = col_character()
## )
Potri.tx2gene <- read_delim("/mnt/picea/storage/reference/Populus-trichocarpa/v3.0/annotation/tx2gene.tsv",
"\t",col_names=c("TXID","GENE"))
## Parsed with column specification:
## cols(
## TXID = col_character(),
## GENE = col_character()
## )
lb.filelist <- list.files("Lacbi2",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Select the samples containing fungi
stopifnot(all(str_which(basename(lb.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(lb.filelist) <- samples$SciLifeID
lb.filelist <- lb.filelist[samples$Experiment %in% c("ECM","FLM")]
Read the expression at the gene level (there is one transcript per gene)
lb.g <- suppressMessages(tximport(files = lb.filelist,
type = "salmon",txOut=TRUE))
counts <- round(lb.g$counts)
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "24.6% percent (5617) of 22822 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(lb.filelist),samples$SciLifeID),]),
aes(x,y,col=Experiment,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 5617 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>%
mutate(Time=samples$Time[match(ind,samples$SciLifeID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 294928 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 294928 rows containing non-finite values (stat_density).
dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Lacbi-raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples[s.sel,],
design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Lacbi-dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is relatively variable (50 to 300 %)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11915_101 | P11915_102 | P11915_103 | P11915_104 | P11915_105 | P11915_106 |
|---|---|---|---|---|---|
| 0.9883 | 3.221 | 3.189 | 1.53 | 1.51 | 1.907 |
| P11915_107 | P11915_112 | P11915_113 | P11915_114 | P11915_115 | P11915_116 |
|---|---|---|---|---|---|
| 1.939 | 1.641 | 0.5662 | 0.5859 | 1.38 | 1.465 |
| P11915_117 | P11915_118 | P11915_123 | P11915_124 | P11915_125 | P11915_126 |
|---|---|---|---|---|---|
| 1.102 | 0.7441 | 0.8141 | 0.4824 | 1.64 | 0.5933 |
| P11915_127 | P11915_128 | P11915_129 | P11915_134 | P11915_135 | P11915_136 |
|---|---|---|---|---|---|
| 0.5453 | 0.735 | 0.6828 | 0.9994 | 1.221 | 0.8101 |
| P11915_137 | P11915_138 | P11915_139 | P11915_140 | P11915_145 | P11915_146 |
|---|---|---|---|---|---|
| 0.5967 | 1.341 | 0.7234 | 1.153 | 1.797 | 0.8721 |
| P11915_147 | P11915_148 | P11915_149 | P11915_150 | P11915_151 |
|---|---|---|---|---|
| 1.078 | 0.6854 | 0.6586 | 0.4662 | 0.8963 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked very well, the data is nearly homoscesdastic
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate clearly both by Experiment and Time in the first 2 components.
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)[s.sel]],
pch=c(17,19)[as.integer(samples$Experiment)[s.sel]-1])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19),
legend=levels(samples$Experiment)[-1])
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples[s.sel,])
ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise A cutoff at a VST value of 1 leaves about 15000 genes, which is adequate for the QA
conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
heatmap.2(t(scale(t(vst[sels[[2]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]
The subset is enriched for higher expression values
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering remains the same even for the most variable genes
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]
The subset is enriched for higher expression values, with a strinkingly normal distribution
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes shows a separation by experiment and time point
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The quality of the data is good. The PCA shows that the samples cluster by experiment and time, however, the heatmap shows a clustering that correlates with the mapping rates, i.e. the mixed amount of reads originating from either organism. The final heatmap seem to indicate that this effect is neglectable albeit confounded.
pt.filelist <- list.files("Potra01",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Select the samples containing fungi
stopifnot(all(str_which(basename(pt.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(pt.filelist) <- samples$SciLifeID
pt.filelist <- pt.filelist[samples$Experiment %in% c("Cont","ECM")]
Read the expression at the gene level (there is one transcript per gene)
pt.g <- suppressMessages(tximport(files = pt.filelist,
type = "salmon",
tx2gene=Potra.tx2gene))
counts <- round(pt.g$counts)
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "13.8% percent (4684) of 34043 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(pt.filelist),samples$SciLifeID),]),
aes(x,y,col=Experiment,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 4684 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>%
mutate(Time=samples$Time[match(ind,samples$SciLifeID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 486035 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 486035 rows containing non-finite values (stat_density).
dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Potra-raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples[s.sel,],
design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Potra-dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is highly variable (20 to 500 %)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11915_104 | P11915_105 | P11915_106 | P11915_107 | P11915_108 | P11915_109 |
|---|---|---|---|---|---|
| 0.1408 | 0.3295 | 0.2518 | 0.4736 | 3.867 | 2.97 |
| P11915_110 | P11915_111 | P11915_115 | P11915_116 | P11915_117 | P11915_118 |
|---|---|---|---|---|---|
| 3.018 | 1.813 | 0.2348 | 0.356 | 0.1237 | 0.1678 |
| P11915_119 | P11915_120 | P11915_121 | P11915_122 | P11915_126 | P11915_127 |
|---|---|---|---|---|---|
| 1.918 | 2.647 | 1.968 | 2.015 | 1.306 | 0.2046 |
| P11915_128 | P11915_129 | P11915_130 | P11915_131 | P11915_132 | P11915_133 |
|---|---|---|---|---|---|
| 0.5862 | 0.4512 | 2.514 | 2.044 | 3.149 | 2.801 |
| P11915_137 | P11915_138 | P11915_139 | P11915_140 | P11915_141 | P11915_142 |
|---|---|---|---|---|---|
| 1.614 | 0.7794 | 0.1832 | 0.3401 | 2.288 | 3.813 |
| P11915_143 | P11915_144 | P11915_148 | P11915_149 | P11915_150 | P11915_151 |
|---|---|---|---|---|---|
| 4.682 | 1.432 | 1.501 | 0.5248 | 1.58 | 0.8726 |
| P11915_152 | P11915_153 | P11915_154 | P11915_155 |
|---|---|---|---|
| 1.795 | 1.866 | 2.409 | 1.501 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)[s.sel]],
pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19),
legend=levels(samples$Experiment)[-3])
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples[s.sel,])
ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA
conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
heatmap.2(t(scale(t(vst[sels[[3]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]
The subset is enriched for higher expression values
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering shows a better clustering by time and experiment, even though there are still outliers
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]
The subset is enriched for higher expression values, with a strinkingly normal distribution
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes is very similar to the overall profile
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.
pt.filelist <- list.files("Potri03",
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Select the samples containing fungi
stopifnot(all(str_which(basename(pt.filelist),samples$SciLifeID) == 1:nrow(samples)))
names(pt.filelist) <- samples$SciLifeID
pt.filelist <- pt.filelist[samples$Experiment %in% c("Cont","ECM")]
Read the expression at the gene level (there is one transcript per gene)
pt.g <- suppressMessages(tximport(files = pt.filelist,
type = "salmon",
tx2gene=Potri.tx2gene))
counts <- round(pt.g$counts)
Check how many genes are never expressed
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "19.5% percent (8064) of 41270 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(pt.filelist),samples$SciLifeID),]),
aes(x,y,col=Experiment,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 8064 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Experiment=samples$Experiment[match(ind,samples$SciLifeID)]) %>%
mutate(Time=samples$Time[match(ind,samples$SciLifeID)])
Color by Experiment
ggplot(dat,aes(x=values,group=ind,col=Experiment)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 744113 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 744113 rows containing non-finite values (stat_density).
dir.create(file.path("..","analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file="../analysis/salmon/Potri-raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
s.sel <- match(colnames(counts),samples$SciLifeID)
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples[s.sel,],
design = ~ Experiment * Time)
## converting counts to integer mode
## factor levels were dropped which had no samples
save(dds,file="../analysis/salmon/Potri-dds.rda")
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is highly variable (20 to 500 %)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11915_104 | P11915_105 | P11915_106 | P11915_107 | P11915_108 | P11915_109 |
|---|---|---|---|---|---|
| 0.1389 | 0.3277 | 0.2512 | 0.4697 | 3.833 | 2.954 |
| P11915_110 | P11915_111 | P11915_115 | P11915_116 | P11915_117 | P11915_118 |
|---|---|---|---|---|---|
| 2.999 | 1.803 | 0.2367 | 0.3513 | 0.125 | 0.1719 |
| P11915_119 | P11915_120 | P11915_121 | P11915_122 | P11915_126 | P11915_127 |
|---|---|---|---|---|---|
| 1.905 | 2.672 | 1.986 | 2.032 | 1.304 | 0.2037 |
| P11915_128 | P11915_129 | P11915_130 | P11915_131 | P11915_132 | P11915_133 |
|---|---|---|---|---|---|
| 0.5915 | 0.4549 | 2.5 | 2.056 | 3.142 | 2.786 |
| P11915_137 | P11915_138 | P11915_139 | P11915_140 | P11915_141 | P11915_142 |
|---|---|---|---|---|---|
| 1.649 | 0.793 | 0.1836 | 0.337 | 2.255 | 3.818 |
| P11915_143 | P11915_144 | P11915_148 | P11915_149 | P11915_150 | P11915_151 |
|---|---|---|---|---|---|
| 4.701 | 1.414 | 1.504 | 0.5314 | 1.572 | 0.8793 |
| P11915_152 | P11915_153 | P11915_154 | P11915_155 |
|---|---|---|---|
| 1.786 | 1.852 | 2.44 | 1.491 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked adequately
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
This looks interesting as the sample separate also both by Experiment and Time in the first 2 components, but somewhat not as clearly as for Laccaria bicolor
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)[s.sel]],
pch=c(17,19)[as.integer(samples$Experiment)[s.sel]])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19),
legend=levels(samples$Experiment)[-3])
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples[s.sel,])
ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Experiment)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise A cutoff at a VST value of 2 leaves about 14000 genes, which is adequate for the QA
conds <- factor(paste(samples$Experiment,samples$Time))[s.sel]
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
heatmap.2(t(scale(t(vst[sels[[3]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],]),decreasing=TRUE) [1:1000]
The subset is enriched for higher expression values
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering shows a better clustering by time and experiment, even though there are still outliers
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[2]],])) [1:1000]
The subset is enriched for higher expression values, with a strinkingly normal distribution
ggplot(list(sub=rowMeans(vst[sels[[2]],][ord,]),
total=rowMeans(vst[sels[[2]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes is very similar to the overall profile
heatmap.2(t(scale(t(vst[sels[[2]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The same conclusion as for Laccaria bicolor apply, despite the difference in the heatmap behaviour that may indicate some bias due to the organisms.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.52.0 tximport_1.12.3
## [3] forcats_0.4.0 stringr_1.4.0
## [5] dplyr_0.8.3 purrr_0.3.2
## [7] readr_1.3.1 tidyr_0.8.3
## [9] tibble_2.1.3 tidyverse_1.2.1
## [11] scatterplot3d_0.3-41 RColorBrewer_1.1-2
## [13] pander_0.6.3 LSD_4.0-0
## [15] hyperSpec_0.99-20180627 ggplot2_3.2.1
## [17] lattice_0.20-38 gplots_3.0.1.1
## [19] DESeq2_1.24.0 SummarizedExperiment_1.14.1
## [21] DelayedArray_0.10.0 BiocParallel_1.18.1
## [23] matrixStats_0.54.0 Biobase_2.44.0
## [25] GenomicRanges_1.36.0 GenomeInfoDb_1.20.0
## [27] IRanges_2.18.2 S4Vectors_0.22.0
## [29] BiocGenerics_0.30.0 data.table_1.12.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 htmlTable_1.13.1 XVector_0.24.0
## [4] base64enc_0.1-3 rstudioapi_0.10 hexbin_1.27.3
## [7] affyio_1.54.0 bit64_0.9-7 AnnotationDbi_1.46.1
## [10] lubridate_1.7.4 xml2_1.2.2 splines_3.6.1
## [13] geneplotter_1.62.0 knitr_1.24 zeallot_0.1.0
## [16] Formula_1.2-3 jsonlite_1.6 broom_0.5.2
## [19] annotate_1.62.0 cluster_2.1.0 BiocManager_1.30.4
## [22] compiler_3.6.1 httr_1.4.1 backports_1.1.4
## [25] assertthat_0.2.1 Matrix_1.2-17 lazyeval_0.2.2
## [28] limma_3.40.6 cli_1.1.0 acepack_1.4.1
## [31] htmltools_0.3.6 tools_3.6.1 gtable_0.3.0
## [34] glue_1.3.1 GenomeInfoDbData_1.2.1 affy_1.62.0
## [37] reshape2_1.4.3 Rcpp_1.0.2 cellranger_1.1.0
## [40] vctrs_0.2.0 preprocessCore_1.46.0 gdata_2.18.0
## [43] nlme_3.1-141 xfun_0.9 testthat_2.2.1
## [46] rvest_0.3.4 gtools_3.8.1 XML_3.98-1.20
## [49] zlibbioc_1.30.0 scales_1.0.0 hms_0.5.1
## [52] yaml_2.2.0 memoise_1.1.0 gridExtra_2.3
## [55] rpart_4.1-15 latticeExtra_0.6-28 stringi_1.4.3
## [58] RSQLite_2.1.2 highr_0.8 genefilter_1.66.0
## [61] checkmate_1.9.4 caTools_1.17.1.2 rlang_0.4.0
## [64] pkgconfig_2.0.2 bitops_1.0-6 evaluate_0.14
## [67] labeling_0.3 htmlwidgets_1.3 bit_1.1-14
## [70] tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
## [73] R6_2.4.0 generics_0.0.2 Hmisc_4.2-0
## [76] DBI_1.0.0 pillar_1.4.2 haven_2.1.1
## [79] foreign_0.8-72 withr_2.1.2 survival_2.44-1.1
## [82] RCurl_1.95-4.12 nnet_7.3-12 modelr_0.1.5
## [85] crayon_1.3.4 KernSmooth_2.23-15 rmarkdown_1.15
## [88] locfit_1.5-9.1 readxl_1.3.1 blob_1.2.0
## [91] digest_0.6.20 xtable_1.8-4 munsell_0.5.0